{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generative Chat Bot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step1 Import packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data dir: ['README.md', 'alpaca_data_zh_51k.json', '.gitattributes']\n",
      "model dir: ['.cache', 'README.md', 'tokenizer_config.json', '.gitattributes', 'tokenizer.json', 'pytorch_model.bin', 'special_tokens_map.json', 'config.json']\n",
      "evaluate dir: ['super_glue', 'smape', 'mase', 'charcut_mt', 'wiki_split', 'mean_iou', 'ter', 'bertscore', 'glue', 'bleurt', 'frugalscore', 'matthews_correlation', 'cer', 'rl_reliability', 'xtreme_s', 'perplexity', 'sari', 'character', 'mahalanobis', 'seqeval', 'indic_glue', 'meteor', 'google_bleu', 'squad_v2', 'competition_math', 'squad', 'nist_mt', 'r_squared', 'confusion_matrix', 'roc_auc', 'f1', 'recall', 'mape', 'chrf', 'pearsonr', 'spearmanr', 'sacrebleu', 'wer', 'xnli', 'mse', 'exact_match', 'mae', 'coval', 'code_eval', 'mauve', 'trec_eval', 'cuad', 'poseval', 'brier_score', 'precision', 'comet', 'accuracy', 'bleu', 'rouge']\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, Pipeline, DataCollatorForSeq2Seq\n",
    "import os\n",
    "data_dir = os.path.expanduser(\"~/datasets/llamafactory/alpaca_zh\")\n",
    "model_dir = os.path.expanduser(\"~/models/Langboat/bloom-389m-zh\")\n",
    "evaluate_dir = os.path.expanduser(\"~/.cache/huggingface/evaluate/metrics\")\n",
    "print(\"data dir:\", os.listdir(data_dir))\n",
    "print(\"model dir:\", os.listdir(model_dir))\n",
    "print(\"evaluate dir:\", os.listdir(evaluate_dir))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step2 Load dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_name = 'alpaca_data_zh_51k.json'\n",
    "data_path = os.path.join(data_dir, data_name)\n",
    "data = load_dataset('json', data_files=data_path, split='train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'instruction': '我们如何在日常生活中减少用水？',\n",
       " 'input': '',\n",
       " 'output': '1. 使用节水装置，如节水淋浴喷头和水龙头。 \\n2. 使用水箱或水桶收集家庭废水，例如洗碗和洗浴。 \\n3. 在社区中提高节水意识。 \\n4. 检查水管和灌溉系统的漏水情况，并及时修复它们。 \\n5. 洗澡时间缩短，使用低流量淋浴头节约用水。 \\n6. 收集雨水，用于园艺或其他非饮用目的。 \\n7. 刷牙或擦手时关掉水龙头。 \\n8. 减少浇水草坪的时间。 \\n9. 尽可能多地重复使用灰水（来自洗衣机、浴室水槽和淋浴的水）。 \\n10. 只购买能源效率高的洗碗机和洗衣机。'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step3 Data preprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BloomTokenizerFast(name_or_path='/home/sxliu/models/Langboat/bloom-389m-zh', vocab_size=42437, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={\n",
       "\t0: AddedToken(\"<unk>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t1: AddedToken(\"<s>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t2: AddedToken(\"</s>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "\t3: AddedToken(\"<pad>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
       "}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_dir)\n",
    "tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def format_input(example):\n",
    "    instruction_text = f\"Human:\\n{example['instruction']}\"\n",
    "    input_text = f\"\\n\\nInput:\\n{example['input']}\" if example['input'].strip() else \"\"\n",
    "    response_text = f\"\\n\\nAssistant:\\n{example['output']}\"\n",
    "    model_input = tokenizer(instruction_text + input_text + response_text)\n",
    "    model_input[\"labels\"] = model_input[\"input_ids\"][1:] + tokenizer.encode(tokenizer.eos_token)\n",
    "    \n",
    "    return model_input"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['input_ids', 'attention_mask', 'labels'],\n",
       "    num_rows: 51155\n",
       "})"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenized_data = data.map(format_input, remove_columns=data.column_names)\n",
    "tokenized_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Human:\\n我们如何在日常生活中减少用水？\\n\\nAssistant:\\n1. 使用节水装置，如节水淋浴喷头和水龙头。 \\n2. 使用水箱或水桶收集家庭废水，例如洗碗和洗浴。 \\n3. 在社区中提高节水意识。 \\n4. 检查水管和灌溉系统的漏水情况，并及时修复它们。 \\n5. 洗澡时间缩短，使用低流量淋浴头节约用水。 \\n6. 收集雨水，用于园艺或其他非饮用目的。 \\n7. 刷牙或擦手时关掉水龙头。 \\n8. 减少浇水草坪的时间。 \\n9. 尽可能多地重复使用灰水（来自洗衣机、浴室水槽和淋浴的水）。 \\n10. 只购买能源效率高的洗碗机和洗衣机。'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode(tokenized_data[0]['input_ids'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "':\\n我们如何在日常生活中减少用水？\\n\\nAssistant:\\n1. 使用节水装置，如节水淋浴喷头和水龙头。 \\n2. 使用水箱或水桶收集家庭废水，例如洗碗和洗浴。 \\n3. 在社区中提高节水意识。 \\n4. 检查水管和灌溉系统的漏水情况，并及时修复它们。 \\n5. 洗澡时间缩短，使用低流量淋浴头节约用水。 \\n6. 收集雨水，用于园艺或其他非饮用目的。 \\n7. 刷牙或擦手时关掉水龙头。 \\n8. 减少浇水草坪的时间。 \\n9. 尽可能多地重复使用灰水（来自洗衣机、浴室水槽和淋浴的水）。 \\n10. 只购买能源效率高的洗碗机和洗衣机。</s>'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode(tokenized_data['labels'][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['input_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 46039\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['input_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 5116\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = tokenized_data.train_test_split(test_size=0.1)\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "collator = DataCollatorForSeq2Seq(tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch = collator(dataset['test'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': tensor([[    3,     3,     3,  ..., 19097,  6362,   420],\n",
       "        [    3,     3,     3,  ..., 18314,   488,   420],\n",
       "        [    3,     3,     3,  ...,   583,    18,  3800],\n",
       "        ...,\n",
       "        [    3,     3,     3,  ...,  3515, 10322,   420],\n",
       "        [    3,     3,     3,  ..., 15599,  5753,   420],\n",
       "        [    3,     3,     3,  ..., 36507, 28223,   420]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],\n",
       "        [0, 0, 0,  ..., 1, 1, 1],\n",
       "        [0, 0, 0,  ..., 1, 1, 1],\n",
       "        ...,\n",
       "        [0, 0, 0,  ..., 1, 1, 1],\n",
       "        [0, 0, 0,  ..., 1, 1, 1],\n",
       "        [0, 0, 0,  ..., 1, 1, 1]]), 'labels': tensor([[ -100,  -100,  -100,  ...,  6362,   420,     2],\n",
       "        [ -100,  -100,  -100,  ...,   488,   420,     2],\n",
       "        [ -100,  -100,  -100,  ...,    18,  3800,     2],\n",
       "        ...,\n",
       "        [ -100,  -100,  -100,  ..., 10322,   420,     2],\n",
       "        [ -100,  -100,  -100,  ...,  5753,   420,     2],\n",
       "        [ -100,  -100,  -100,  ..., 28223,   420,     2]])}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "batch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step4 Create model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sxliu/miniconda3/envs/torch2.4/lib/python3.9/site-packages/torch/_utils.py:776: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
      "  return self.fget.__get__(instance, owner)()\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "BloomForCausalLM(\n",
       "  (transformer): BloomModel(\n",
       "    (word_embeddings): Embedding(42437, 1024)\n",
       "    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "    (h): ModuleList(\n",
       "      (0-23): 24 x BloomBlock(\n",
       "        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "        (self_attention): BloomAttention(\n",
       "          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)\n",
       "          (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
       "        )\n",
       "        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "        (mlp): BloomMLP(\n",
       "          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "          (gelu_impl): BloomGelu()\n",
       "          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "  )\n",
       "  (lm_head): Linear(in_features=1024, out_features=42437, bias=False)\n",
       ")"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = AutoModelForCausalLM.from_pretrained(model_dir)\n",
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step5 Configure TrainningArguments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "TrainingArguments(\n",
       "_n_gpu=1,\n",
       "accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},\n",
       "adafactor=False,\n",
       "adam_beta1=0.9,\n",
       "adam_beta2=0.999,\n",
       "adam_epsilon=1e-08,\n",
       "auto_find_batch_size=False,\n",
       "batch_eval_metrics=False,\n",
       "bf16=False,\n",
       "bf16_full_eval=False,\n",
       "data_seed=None,\n",
       "dataloader_drop_last=False,\n",
       "dataloader_num_workers=0,\n",
       "dataloader_persistent_workers=False,\n",
       "dataloader_pin_memory=True,\n",
       "dataloader_prefetch_factor=None,\n",
       "ddp_backend=None,\n",
       "ddp_broadcast_buffers=None,\n",
       "ddp_bucket_cap_mb=None,\n",
       "ddp_find_unused_parameters=None,\n",
       "ddp_timeout=1800,\n",
       "debug=[],\n",
       "deepspeed=None,\n",
       "disable_tqdm=False,\n",
       "dispatch_batches=None,\n",
       "do_eval=True,\n",
       "do_predict=False,\n",
       "do_train=False,\n",
       "eval_accumulation_steps=None,\n",
       "eval_delay=0,\n",
       "eval_do_concat_batches=True,\n",
       "eval_on_start=False,\n",
       "eval_steps=500,\n",
       "eval_strategy=steps,\n",
       "eval_use_gather_object=False,\n",
       "evaluation_strategy=None,\n",
       "fp16=False,\n",
       "fp16_backend=auto,\n",
       "fp16_full_eval=False,\n",
       "fp16_opt_level=O1,\n",
       "fsdp=[],\n",
       "fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},\n",
       "fsdp_min_num_params=0,\n",
       "fsdp_transformer_layer_cls_to_wrap=None,\n",
       "full_determinism=False,\n",
       "gradient_accumulation_steps=1,\n",
       "gradient_checkpointing=False,\n",
       "gradient_checkpointing_kwargs=None,\n",
       "greater_is_better=None,\n",
       "group_by_length=False,\n",
       "half_precision_backend=auto,\n",
       "hub_always_push=False,\n",
       "hub_model_id=None,\n",
       "hub_private_repo=False,\n",
       "hub_strategy=every_save,\n",
       "hub_token=<HUB_TOKEN>,\n",
       "ignore_data_skip=False,\n",
       "include_for_metrics=[],\n",
       "include_inputs_for_metrics=False,\n",
       "include_num_input_tokens_seen=False,\n",
       "include_tokens_per_second=False,\n",
       "jit_mode_eval=False,\n",
       "label_names=None,\n",
       "label_smoothing_factor=0.0,\n",
       "learning_rate=5e-05,\n",
       "length_column_name=length,\n",
       "load_best_model_at_end=False,\n",
       "local_rank=0,\n",
       "log_level=passive,\n",
       "log_level_replica=warning,\n",
       "log_on_each_node=True,\n",
       "logging_dir=./checkpints/runs/Nov07_19-13-35_cu01,\n",
       "logging_first_step=False,\n",
       "logging_nan_inf_filter=True,\n",
       "logging_steps=500,\n",
       "logging_strategy=steps,\n",
       "lr_scheduler_kwargs={},\n",
       "lr_scheduler_type=linear,\n",
       "max_grad_norm=1.0,\n",
       "max_steps=-1,\n",
       "metric_for_best_model=None,\n",
       "mp_parameters=,\n",
       "neftune_noise_alpha=None,\n",
       "no_cuda=False,\n",
       "num_train_epochs=3,\n",
       "optim=adamw_torch,\n",
       "optim_args=None,\n",
       "optim_target_modules=None,\n",
       "output_dir=./checkpints,\n",
       "overwrite_output_dir=False,\n",
       "past_index=-1,\n",
       "per_device_eval_batch_size=8,\n",
       "per_device_train_batch_size=8,\n",
       "prediction_loss_only=False,\n",
       "push_to_hub=False,\n",
       "push_to_hub_model_id=None,\n",
       "push_to_hub_organization=None,\n",
       "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
       "ray_scope=last,\n",
       "remove_unused_columns=True,\n",
       "report_to=['tensorboard'],\n",
       "restore_callback_states_from_checkpoint=False,\n",
       "resume_from_checkpoint=None,\n",
       "run_name=./checkpints,\n",
       "save_on_each_node=False,\n",
       "save_only_model=False,\n",
       "save_safetensors=True,\n",
       "save_steps=500,\n",
       "save_strategy=epoch,\n",
       "save_total_limit=None,\n",
       "seed=42,\n",
       "skip_memory_metrics=True,\n",
       "split_batches=None,\n",
       "tf32=None,\n",
       "torch_compile=False,\n",
       "torch_compile_backend=None,\n",
       "torch_compile_mode=None,\n",
       "torch_empty_cache_steps=None,\n",
       "torchdynamo=None,\n",
       "tpu_metrics_debug=False,\n",
       "tpu_num_cores=None,\n",
       "use_cpu=False,\n",
       "use_ipex=False,\n",
       "use_legacy_prediction_loop=False,\n",
       "use_liger_kernel=False,\n",
       "use_mps_device=False,\n",
       "warmup_ratio=0.0,\n",
       "warmup_steps=0,\n",
       "weight_decay=0.0,\n",
       ")"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_args = TrainingArguments(\n",
    "    output_dir=\"./checkpoints\",\n",
    "    num_train_epochs=3,\n",
    "    per_device_train_batch_size=8,\n",
    "    per_device_eval_batch_size=8,\n",
    "    save_strategy='epoch',\n",
    "    eval_strategy='steps',\n",
    ")\n",
    "train_args"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step6 Create Trainer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n"
     ]
    }
   ],
   "source": [
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=train_args,\n",
    "    train_dataset=dataset['train'],\n",
    "    eval_dataset=dataset['test'],\n",
    "    data_collator=DataCollatorForSeq2Seq(tokenizer),\n",
    "    \n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step7 Model trainning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='235' max='17265' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [  235/17265 02:00 < 2:27:10, 1.93 it/s, Epoch 0.04/3]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Step</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "ename": "OutOfMemoryError",
     "evalue": "CUDA out of memory. Tried to allocate 498.00 MiB (GPU 0; 23.70 GiB total capacity; 11.46 GiB already allocated; 489.75 MiB free; 12.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mOutOfMemoryError\u001b[0m                          Traceback (most recent call last)",
      "Input \u001b[0;32mIn [18]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/torch2.4/lib/python3.9/site-packages/transformers/trainer.py:2122\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   2120\u001b[0m         hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m   2121\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 2122\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   2123\u001b[0m \u001b[43m        \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2124\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2125\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2126\u001b[0m \u001b[43m        \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2127\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/torch2.4/lib/python3.9/site-packages/transformers/trainer.py:2474\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   2471\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_begin(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m   2473\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39maccumulate(model):\n\u001b[0;32m-> 2474\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_items_in_batch\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2476\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m   2477\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m   2478\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[1;32m   2479\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m   2480\u001b[0m ):\n\u001b[1;32m   2481\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m   2482\u001b[0m     tr_loss \u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m+\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
      "File \u001b[0;32m~/miniconda3/envs/torch2.4/lib/python3.9/site-packages/transformers/trainer.py:3572\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs, num_items_in_batch)\u001b[0m\n\u001b[1;32m   3569\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m   3571\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 3572\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_items_in_batch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_items_in_batch\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3574\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m inputs\n\u001b[1;32m   3575\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m   3576\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mtorch_empty_cache_steps \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   3577\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m%\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mtorch_empty_cache_steps \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m   3578\u001b[0m ):\n",
      "File \u001b[0;32m~/miniconda3/envs/torch2.4/lib/python3.9/site-packages/transformers/trainer.py:3625\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs, num_items_in_batch)\u001b[0m\n\u001b[1;32m   3623\u001b[0m         loss_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_items_in_batch\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m num_items_in_batch\n\u001b[1;32m   3624\u001b[0m     inputs \u001b[38;5;241m=\u001b[39m {\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39minputs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mloss_kwargs}\n\u001b[0;32m-> 3625\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3626\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m   3627\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m   3628\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
      "File \u001b[0;32m~/miniconda3/envs/torch2.4/lib/python3.9/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1499\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1500\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m   1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[0;32m~/miniconda3/envs/torch2.4/lib/python3.9/site-packages/transformers/models/bloom/modeling_bloom.py:994\u001b[0m, in \u001b[0;36mBloomForCausalLM.forward\u001b[0;34m(self, input_ids, past_key_values, attention_mask, head_mask, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, **deprecated_arguments)\u001b[0m\n\u001b[1;32m    992\u001b[0m labels \u001b[38;5;241m=\u001b[39m labels\u001b[38;5;241m.\u001b[39mto(lm_logits\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m    993\u001b[0m \u001b[38;5;66;03m# Shift so that tokens < n predict n\u001b[39;00m\n\u001b[0;32m--> 994\u001b[0m shift_logits \u001b[38;5;241m=\u001b[39m \u001b[43mlm_logits\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m:\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcontiguous\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    995\u001b[0m shift_labels \u001b[38;5;241m=\u001b[39m labels[\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m, \u001b[38;5;241m1\u001b[39m:]\u001b[38;5;241m.\u001b[39mcontiguous()\n\u001b[1;32m    996\u001b[0m batch_size, seq_length, vocab_size \u001b[38;5;241m=\u001b[39m shift_logits\u001b[38;5;241m.\u001b[39mshape\n",
      "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 498.00 MiB (GPU 0; 23.70 GiB total capacity; 11.46 GiB already allocated; 489.75 MiB free; 12.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"
     ]
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step8 Model inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe = Pipeline(\"text-generation\", model=model, tokenizer=tokenizer, device=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"Humsn:\\n{}\\n\\nAssistant:\\n\".format(\"考试有哪些技巧\")\n",
    "pipe(question, max_length=128, do_sample=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch2.4",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
